Winter Olympics Medals over Time

1. Medal Counts over Time

a)

I have decided to merge all the designations for the same country to match the gdp_pop table. I have done this by recoding NOC to match the designation’s origin country NOC code.This was only done for Russia and Germany since only top 10 countries are being used in the visualizations.

To prevent the issue of counting multiple medals for team events, I have kept distinct values in terms of Sex, Team, Game, and Event (everything else included in distinct() function was to maintain variables from athletes_events)

library(ggplot2)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## v purrr   0.3.4
## Warning: package 'readr' was built under R version 4.1.2
## Warning: package 'forcats' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
setwd('C:/1columbia/dv/winterolympics')
athletes_events <- read.csv("./data/athletes_and_events.csv")
gdp_pop <- read.csv("./data/gdp_pop.csv")
noc_regions <- read.csv("./data/noc_regions.csv")

#recoding: all russian teams = Russia, all german teams = Germany

athletes_events$NOC[athletes_events$NOC == "URS"] <- "RUS"
athletes_events$NOC[athletes_events$NOC == "FRG"] <- "GER"
athletes_events$NOC[athletes_events$NOC == "GDR"] <- "GER"
athletes_events$NOC[athletes_events$NOC == "EUA"] <- "GER"

#removing duplicate medals for team-events

athletes <- distinct(athletes_events, Sex, Team, NOC, Games, Year, Season, City, Sport, Event, Medal)
merge1 <- left_join(athletes, noc_regions)
## Joining, by = "NOC"
dataset <- left_join(merge1, gdp_pop, by=c("NOC" = "Code"))

b)

top10winter <- dataset %>%
  drop_na(Medal) %>%
  filter(Season == 'Winter') %>%
  group_by(Country) %>%
  summarize(total = sum(Medal=='Gold')) %>%
  arrange(desc(total)) %>%
  mutate(rank = row_number()) %>%
  filter(rank <= 10) %>%
  ungroup() %>%
  select(Country) %>%
  unlist() %>%
  c()

#total nmber of winter olympics
dataset %>%
  filter(Season== 'Winter',Country %in% top10winter) %>%
  group_by(Country, Year) %>%
  summarize(total = length(unique(Year))) %>%
  ggplot(., aes(x=Country, y=total)) +
  geom_col(fill = 'lightblue') +
  coord_flip() +
  labs(x = "Country",
       y = "# of Winter Olympics",
       title = "Winter Olympics Participation by Country", caption="Source: International Olympic Committee") +
  theme_classic()
## `summarise()` has grouped output by 'Country'. You can override using the `.groups` argument.

This table displays the number of times the top 10 gold-medal-earning countries had participated in the Winter Olympics.

dataset %>%
  drop_na(Medal) %>%
  filter(Season=="Winter",Country %in% top10winter) %>%
  filter(Medal =='Gold') %>%
  group_by(Country, Medal, Year) %>%
  tally() %>%
  ggplot(., aes(x=Year, y=n, group=Country, color = Country)) +
  geom_line() +
  labs(x = "Year",
       y = "# of Gold Medals",
       title = "Gold Medals Earned Over Time", caption="Source: International Olympic Committee") +
  theme_classic()

Line graph displays the gold medals each country has earned every Winter Olympics

dataset %>%
  drop_na(Medal) %>%
  filter(Season== "Winter",Country %in% top10winter) %>%
  group_by(Country, Medal) %>%
  tally() %>%
  ggplot(., aes(x=Country, y=n, fill=Medal )) +
  coord_flip() +
  geom_bar(position="dodge", stat="identity") +
  labs(x = "Team",
       y = "# of Medals Earned",
       title = "Olympic Medals Earned by Country and Medal Type", caption="Source: International Olympic Committee") +
  scale_fill_manual('Medal Type', values=c('tan4', 'gold2', 'snow3')) +
  theme_classic()

This table shows the medal counts by medal type that each team has earned all-time.

2. Medal Counts adjusted by Population, GDP

a) Unadjusted ranking (top 10 in gold medals)

dataset %>%
  filter(Season== 'Winter', Medal== 'Gold', Country %in% top10winter) %>%
  group_by(Country) %>%
  summarize(total = sum(Medal=='Gold')) %>%
  ggplot(., aes(x=Country, y=total)) +
  geom_col(fill = 'gold2') +
  coord_flip() +
  labs(x = "Country",
       y = "# of Gold Medals Earned",
       title = "Winter Olympics Top 10 Gold Medals by Country (Unadjusted Ranking)", caption="Source: International Olympic Committee") +
  theme_classic()

Since we are going by gold medal ranking, this bar chat is essentially the same as the previous visual (grouped bar chart) but with gold medals only.

b) Ranking by GDP per capita

top10winter_gdp_capita <- dataset %>%
  drop_na(GDP.per.Capita, Medal) %>%
  filter(Season == "Winter", Medal=="Gold") %>%
  group_by(Country) %>%
  summarise(medalbygdp = (GDP.per.Capita)/(sum(Medal=='Gold')), .groups='drop') %>%
  distinct(Country, medalbygdp) %>%
  arrange(medalbygdp) %>%
  mutate(rank = row_number()) %>%
  filter(rank<= 10) %>%
  ungroup() %>%
  select(Country) %>%
  unlist() %>%
  c()

dataset %>%
  filter(Season== 'Winter', Medal== 'Gold', Country %in% top10winter_gdp_capita) %>%
  group_by(Country) %>%
  summarize(total = (GDP.per.Capita)/(sum(Medal=='Gold'))) %>%
  ggplot(., aes(x=Country, y=total)) +
  geom_col(fill = 'gold2') +
  coord_flip() +
  labs(x = "Country",
       y = "GDP per Medal",
       title = "Winter Olympics GDP per capita per Gold Medal", caption="Source: International Olympic Committee") +
  theme_classic()
## `summarise()` has grouped output by 'Country'. You can override using the `.groups` argument.

This ranking is done so that highest ranking country is the one with the lowest ratio of Gold Medals per GDP per capita. This is because a ratio approaching 0 means a country has a high medal count with a low GDP per capita. This is under the assumption that greater GDP = greater # of medals.

c) Ranking by Population

top10winter_pop <- dataset %>%
  drop_na(Population, Medal) %>%
  filter(Season == "Winter", Medal=="Gold") %>%
  group_by(Country) %>%
  summarise(medalbypop = (Population)/(sum(Medal=='Gold')), .groups='drop') %>%
  distinct(Country, medalbypop) %>%
  arrange(medalbypop) %>%
  mutate(rank = row_number()) %>%
  filter(rank<= 10) %>%
  ungroup() %>%
  select(Country) %>%
  unlist() %>%
  c()

dataset %>%
  filter(Season== 'Winter', Medal== 'Gold', Country %in% top10winter_pop) %>%
  group_by(Country) %>%
  summarize(total = (Population)/(sum(Medal=='Gold'))) %>%
  ggplot(., aes(x=Country, y=total)) +
  geom_col(fill = 'gold2') +
  coord_flip() +
  labs(x = "Country",
       y = "Population per Medal",
       title = "Winter Olympics Population per Gold Medal", caption="Source: International Olympic Committee") +
  theme_classic()
## `summarise()` has grouped output by 'Country'. You can override using the `.groups` argument.

Liechtenstein only has two golden medals with a population of 37,531.

3. Host Country Advantage

Data preparation for host country advantage visualizations

library(rvest)
## Warning: package 'rvest' was built under R version 4.1.2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(stringr)
library(tidyverse)
wiki_hosts <- read_html("https://en.wikipedia.org/wiki/List_of_Olympic_Games_host_cities")
hosts <- html_table(html_nodes(wiki_hosts, "table")[[2]], fill=TRUE)[-1]
hosts <- hosts %>% filter(Winter != "") %>%
  select(City, Country, Year)

hosts <- select(hosts, Year, Country, City)
hosts <- rename(hosts, HostCountry = Country)
hosts <- rename(hosts, HostCity = City)
hosts$HostCountry[hosts$HostCountry == "Russia[h]"] <- "Russia"
hosts$HostCity[hosts$HostCity == "Innsbruck[g]"] <- "Innsbruck"
data_prep <- filter(dataset, Season == "Winter")
host_dataset <- left_join(data_prep, hosts, by=c("Year" = "Year"))
avg_medals <- host_dataset %>%
  filter(Country %in% hosts$HostCountry, Medal =="Gold", Country != "China") %>%
  group_by(Country) %>%
  summarise(avgtotal = sum(Medal=="Gold")/length(unique(Year)))


avg_medals_hosting <- host_dataset %>%
  filter(Country == HostCountry, Medal=="Gold") %>%
  group_by(Country) %>%
  summarise(avgtotal = sum(Medal=="Gold")/length(unique(Year)))
  
ggplot(avg_medals_hosting, aes(x=Country, y=avgtotal)) + 
  geom_point(data=avg_medals_hosting, size=3, color="blue") + 
  geom_point(data=avg_medals, aes(x=Country, y=avgtotal), size=3, color="red" ) +
  scale_colour_manual(values=c('Average Gold Medals Earned (Total)'='blue', 'Average Gold Medals Earned (Hosting)'='red')) +
  labs(x = "Country",
       y = "Average Gold Medals",
       title = "Winter Olympics Host Country Advantage (Gold Medals)",caption="Sources: International Olympic Committee, Wikipedia") +
  theme_classic()

You can see that for a majority of hosting countries (7/10), there was a higher average gold medals earned when they had hosted the Olympics compared to their all-time average. The only case in which hosting led to a lower average gold medal attainment was for Germany, but this could partially be due to the fact that Germany’s figure is a combination of five designations over time.

4. Most Successful Athletes

top_medalists <- athletes_events %>%
  drop_na(Medal) %>%
  filter(Season =="Winter") %>%
  group_by(Name) %>%
  summarise(total = sum(Medal!="")) %>%
  arrange(desc(total)) %>%
  mutate(rank = row_number()) %>%
  filter(rank<= 10) %>%
  ungroup() %>%
  select(Name) %>%
  unlist() %>%
  c()

athletes_events %>%
  drop_na(Medal) %>%
  filter(Name %in% top_medalists) %>%
  group_by(Name, Team) %>%
  summarise(total = sum(Medal!="")) %>%
  ggplot(., aes(x=Name, y=total, color=Team, fill=Team)) +
  geom_col() +
  coord_flip() +
  labs(x = "Athlete",
       y = "Total Medals Earned",
       title = "Winter Olympics Top 10 Athletes (Total Medals)", caption="Source: International Olympic Committee") +
  theme_classic()
## `summarise()` has grouped output by 'Name'. You can override using the `.groups` argument.

This barplot highlights athletes with the greatest total medals earned (bronze, silver, gold = 1). The color represents there team designation for the number of medals they had earned.

athletes_events %>%
  drop_na(Medal) %>%
  drop_na(Height) %>%
  drop_na(Weight) %>%
  filter(Season=="Winter") %>%
  group_by(Medal) %>%
  summarise(avgheight= mean(Height), avgweight = mean(Weight)) %>%
  ggplot(., aes(x=reorder(Medal, -avgheight), y=avgheight, fill=Medal)) +
  geom_col() + coord_cartesian(ylim = c(172, 178))+
  scale_y_continuous(breaks = seq(172, 178, by = 2)) +
  labs(x = "Medal Type",
       y = "Average Height (CM)",
       title = "Winter Olympics Average Athlete Height by Medal Type", caption="Source: International Olympic Committee") +
  theme_classic()

There is a thin correlation between earning a higher level medal and athlete height. Keep in mind the y-axis scale of the graph is very small to show the differences in the average heights of each medalist category. There is only about a one centimeter range in average heights! So it could also be said that 175CM is the ideal medalist height(?)

5. Make two plots interactive

#Line Graph with Plotly

library(plotly)
## Warning: package 'plotly' was built under R version 4.1.2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
int_line <- dataset %>%
  drop_na(Medal) %>%
  filter(Season=="Winter",Country %in% top10winter) %>%
  filter(Medal =='Gold') %>%
  group_by(Country, Medal, Year) %>%
  tally() %>%
  plot_ly(x = ~Year, y= ~n) %>%
  add_lines(alpha = .9, name = ~Country, color = ~Country, hoverinfo = "yes") %>%
  layout(title = "Gold Medals Earned Over Time", 
         xaxis = list(title = "Year", zeroline = FALSE),
         yaxis = list(title = "# of Gold Medals Earned",zeroline = FALSE))

int_line
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

The line graph from question #2 made interactive using plotly. This interactive graph is more insightful because hovering shows the number of gold medals earned by country for each year. This may have otherwise been difficult to discern from the static graphic.

#Bar Graph with ggplotly

int_bar <- dataset %>%
  drop_na(Medal) %>%
  filter(Season== "Winter",Country %in% top10winter) %>%
  group_by(Country, Medal) %>%
  tally() %>%
  ggplot(., aes(x=Country, y=n, fill=Medal )) +
  coord_flip() +
  geom_bar(position="dodge", stat="identity") +
  labs(x = "Team",
       y = "# of Medals Earned",
       title = "Olympic Medals Earned by Country and Medal Type", caption="Source: International Olympic Committee") +
  scale_fill_manual('Medal Type', values=c('tan4', 'gold2', 'snow3')) +
  theme_classic()

ggplotly(int_bar)

Interactive stacked bar graph using the ggplotly wrapper.

##6. Data Table

The data table below shows the total number of each medal type that every Winter Olympic athlete has earned up to 2014.

library(DT)
## Warning: package 'DT' was built under R version 4.1.2
table_data <- athletes_events %>%
  drop_na(Medal) %>%
  filter(Season =="Winter") %>%
  group_by(Name, Medal) %>%
  tally() %>%
  spread(Medal, n)

datatable(table_data) %>%
  formatStyle('Bronze', color = "white", backgroundColor = '#8f7143', fontWeight='bold') %>%
  formatStyle('Silver', color = "white", backgroundColor = 'grey', fontWeight='bold') %>%
  formatStyle('Gold', color = "white", backgroundColor = 'goldenrod', fontWeight='bold')